Data Preparation
full_raw_data = read_csv(here::here("Data", "full_ok_cupid.csv"))
get_first_word = Vectorize(function(x){
if (is.na(x)) {
return(x)
}
str_split(x, " ")[[1]][1]
})
can_char_be_factor = function(x, n = 20) {
if (is.character(x)) {
return(length(unique(x)) < n)
}
return(FALSE)
}
get_diet = Vectorize(function(x, importance = FALSE) {
split_char = str_split(x, " ")[[1]]
if (length(split_char) == 1) {
if (importance) {
return(NA)
}
return(x)
}
if (importance) {
return(split_char[1])
}
split_char[2]
})
other_importance = Vectorize(function(x) {
if (is.na(x)) {
return(x)
}
if (str_detect(x, "and")) {
and_split = str_split(x, "and")[[1]]
return(str_trim(and_split[2]))
}
if (str_detect(x, "but")) {
but_split = str_split(x, "but")[[1]]
return(str_trim(but_split[2]))
}
return(NA)
})
get_pets = Vectorize(function(x, pet_type) {
if (is.na(x)) {
return(x)
}
if (str_detect(x, paste("has", pet_type))) {
return(paste("has", pet_type))
}
if (str_detect(x, paste("dislikes", pet_type))) {
return(paste("dislikes", pet_type))
}
if (str_detect(x, paste("likes", pet_type))) {
return(paste("likes", pet_type))
}
return(NA)
})
get_kids = Vectorize(function(x, wants = FALSE) {
if (is.na(x)) {
return(x)
}
if (wants) {
if (str_detect(x, "doesn't want more")) {
return("doesn't want more kids")
}
if (str_detect(x, "doesn't want")) {
return("doesn't want kids")
}
if (str_detect(x, "might want")) {
return("might want kids")
}
if (str_detect(x, "wants more")) {
return("wants more kids")
}
if (str_detect(x, "wants")) {
return("wants kids")
}
} else {
if (str_detect(x, "doesn't have")) {
return("doesn't have kids")
}
if (str_detect(x, "has kids")) {
return("has kids")
}
if (str_detect(x, "has a kid")) {
return("has a kid")
}
}
return(NA)
})
location_data = full_raw_data %>% select(location) %>% unique() %>% bind_cols(geocode(unlist(.)))
san_fran_loc = c(-122.4194155, 37.77493)
all_locations = location_data %>% select(-location) %>% as.matrix()
location_data$distance = geosphere::distGeo(san_fran_loc, all_locations) * 0.000621371 # calculates distance between SF and all other locations and converts from meters to mils
location_data$visiting = location_data$distance > 25 # users are designated as "visiting" if the specified location on their profile is a certain distance from SF
full_cleaned_data = full_raw_data %>%
mutate_if(is.character, str_replace_all, pattern = "’", replacement = "'") %>%
mutate(religion_raw = religion, religion_import = other_importance(religion), religion = get_first_word(religion),
sign_raw = sign, sign_import = other_importance(sign), sign = get_first_word(sign),
diet_raw = diet, diet_import = get_diet(diet, TRUE), diet = get_diet(diet),
dogs = get_pets(pets, "dogs"), cats = get_pets(pets, "cats"), pets_raw = pets, pets = NULL,
kids = get_kids(offspring), kids_import = get_kids(offspring, TRUE), kids_raw = offspring, offspring = NULL,
speaks_en = str_detect(speaks, "english"), multi_ling = str_detect(speaks, ","),
ethnicity_raw = ethnicity, ethnicity = collapse_ethnicity(ethnicity),
last_online_raw = last_online, last_online = ymd_hm(last_online),
year_last_online = year(last_online), month_last_online = month(last_online), day_last_online = day(last_online),
time_since_online = as.period(max(last_online) - last_online), days_since_online = period_to_seconds(time_since_online) %>% (function(x) x / 86400)) %>%
mutate_if(can_char_be_factor, factor) %>% left_join(location_data)
saveRDS(full_cleaned_data, here::here("Data", "full_ok_cupid_cleaned.rds"))
write_csv(full_cleaned_data, here::here("Data", "full_ok_cupid_cleaned.csv"))
Descriptive Statistics
full_data_descriptives = full_cleaned_data %>% skim_to_list()
## Warning: No summary functions for vectors of class: Period.
## Coercing to character
full_data_descriptives[[1]] %>% kable(caption = "Text Variables") %>% kable_styling(bootstrap_options = c("striped", "condensed", "responsive"))
Text Variables
|
variable
|
missing
|
complete
|
n
|
min
|
max
|
empty
|
n_unique
|
|
education
|
6628
|
53318
|
59946
|
10
|
33
|
0
|
32
|
|
essay0
|
5485
|
54461
|
59946
|
1
|
48854
|
0
|
54351
|
|
essay1
|
7571
|
52375
|
59946
|
1
|
7955
|
0
|
51517
|
|
essay2
|
9638
|
50308
|
59946
|
1
|
6129
|
0
|
48635
|
|
essay3
|
11476
|
48470
|
59946
|
1
|
4374
|
0
|
43533
|
|
essay4
|
10537
|
49409
|
59946
|
1
|
44469
|
0
|
49260
|
|
essay5
|
10847
|
49099
|
59946
|
1
|
30446
|
0
|
48964
|
|
essay6
|
13771
|
46175
|
59946
|
1
|
11385
|
0
|
43603
|
|
essay7
|
12450
|
47496
|
59946
|
1
|
3722
|
0
|
45555
|
|
essay8
|
19214
|
40732
|
59946
|
1
|
13304
|
0
|
39325
|
|
essay9
|
12602
|
47344
|
59946
|
1
|
11444
|
0
|
45444
|
|
ethnicity_raw
|
5680
|
54266
|
59946
|
5
|
103
|
0
|
217
|
|
job
|
8198
|
51748
|
59946
|
5
|
33
|
0
|
21
|
|
last_online_raw
|
0
|
59946
|
59946
|
16
|
16
|
0
|
30123
|
|
location
|
0
|
59946
|
59946
|
12
|
35
|
0
|
199
|
|
religion_raw
|
20226
|
39720
|
59946
|
5
|
42
|
0
|
45
|
|
sign_raw
|
11056
|
48890
|
59946
|
3
|
39
|
0
|
48
|
|
speaks
|
50
|
59896
|
59946
|
7
|
107
|
0
|
7647
|
|
time_since_online
|
0
|
59946
|
59946
|
2
|
15
|
0
|
30123
|
full_data_descriptives[[2]] %>% kable(caption = "Categorical Variables") %>% kable_styling(bootstrap_options = c("striped", "condensed", "responsive"))
Categorical Variables
|
variable
|
missing
|
complete
|
n
|
n_unique
|
top_counts
|
ordered
|
|
body_type
|
5296
|
54650
|
59946
|
12
|
ave: 14652, fit: 12711, ath: 11819, NA: 5296
|
FALSE
|
|
cats
|
31323
|
28623
|
59946
|
3
|
NA: 31323, lik: 18450, has: 7274, dis: 2899
|
FALSE
|
|
diet
|
24395
|
35551
|
59946
|
6
|
any: 27881, NA: 24395, veg: 4986, oth: 1790
|
FALSE
|
|
diet_import
|
31734
|
28212
|
59946
|
2
|
NA: 31734, mos: 21508, str: 6704
|
FALSE
|
|
diet_raw
|
24395
|
35551
|
59946
|
18
|
NA: 24395, mos: 16585, any: 6183, str: 5113
|
FALSE
|
|
dogs
|
22512
|
37434
|
59946
|
3
|
lik: 28380, NA: 22512, has: 8493, dis: 561
|
FALSE
|
|
drinks
|
2985
|
56961
|
59946
|
6
|
soc: 41780, rar: 5957, oft: 5164, not: 3267
|
FALSE
|
|
drugs
|
14080
|
45866
|
59946
|
3
|
nev: 37724, NA: 14080, som: 7732, oft: 410
|
FALSE
|
|
ethnicity
|
5680
|
54266
|
59946
|
11
|
whi: 32831, asi: 6134, NA: 5680, mul: 5051
|
FALSE
|
|
kids
|
38895
|
21051
|
59946
|
3
|
NA: 38895, doe: 16132, has: 2461, has: 2458
|
FALSE
|
|
kids_import
|
46885
|
13061
|
59946
|
5
|
NA: 46885, mig: 4403, doe: 4059, wan: 3790
|
FALSE
|
|
kids_raw
|
35561
|
24385
|
59946
|
15
|
NA: 35561, doe: 7560, doe: 3875, doe: 3565
|
FALSE
|
|
orientation
|
0
|
59946
|
59946
|
3
|
str: 51606, gay: 5573, bis: 2767, NA: 0
|
FALSE
|
|
pets_raw
|
19921
|
40025
|
59946
|
15
|
NA: 19921, lik: 14814, lik: 7224, lik: 4313
|
FALSE
|
|
religion
|
20226
|
39720
|
59946
|
9
|
NA: 20226, agn: 8812, oth: 7743, ath: 6985
|
FALSE
|
|
religion_import
|
32007
|
27939
|
59946
|
4
|
NA: 32007, not: 12212, lau: 8995, som: 4516
|
FALSE
|
|
sex
|
0
|
59946
|
59946
|
2
|
m: 35829, f: 24117, NA: 0
|
FALSE
|
|
sign
|
11056
|
48890
|
59946
|
12
|
NA: 11056, leo: 4374, gem: 4310, lib: 4207
|
FALSE
|
|
sign_import
|
23180
|
36766
|
59946
|
3
|
NA: 23180, it’: 19333, it : 16758, it : 675
|
FALSE
|
|
smokes
|
5512
|
54434
|
59946
|
5
|
no: 43896, NA: 5512, som: 3787, whe: 3040
|
FALSE
|
|
status
|
0
|
59946
|
59946
|
5
|
sin: 55697, see: 2064, ava: 1865, mar: 310
|
FALSE
|
full_data_descriptives[[4]] %>% kable(caption = "Dummy Variables") %>% kable_styling(bootstrap_options = c("striped", "condensed", "responsive"))
Dummy Variables
|
variable
|
missing
|
complete
|
n
|
mean
|
count
|
|
multi_ling
|
50
|
59896
|
59946
|
0.51
|
TRU: 30824, FAL: 29072, NA: 50
|
|
speaks_en
|
50
|
59896
|
59946
|
1
|
TRU: 59896, NA: 50
|
|
visiting
|
0
|
59946
|
59946
|
0.043
|
FAL: 57390, TRU: 2556, NA: 0
|
full_data_descriptives[[5]] %>% kable(caption = "Continuous Variables") %>% kable_styling(bootstrap_options = c("striped", "condensed", "responsive"))
Continuous Variables
|
variable
|
missing
|
complete
|
n
|
mean
|
sd
|
p0
|
p25
|
p50
|
p75
|
p100
|
hist
|
|
age
|
0
|
59946
|
59946
|
32.34
|
9.45
|
18
|
26
|
30
|
37
|
110
|
▇▆▂▁▁▁▁▁
|
|
days_since_online
|
0
|
59946
|
59946
|
40.09
|
77.28
|
0
|
1.32
|
3.77
|
32.51
|
370.3
|
▇▁▁▁▁▁▁▁
|
|
distance
|
0
|
59946
|
59946
|
10.47
|
104.38
|
3.4e-05
|
3.4e-05
|
3.4e-05
|
10.42
|
7642.77
|
▇▁▁▁▁▁▁▁
|
|
height
|
3
|
59943
|
59946
|
68.3
|
3.99
|
1
|
66
|
68
|
71
|
95
|
▁▁▁▁▁▇▂▁
|
|
income
|
0
|
59946
|
59946
|
20033.22
|
97346.19
|
-1
|
-1
|
-1
|
-1
|
1e+06
|
▇▁▁▁▁▁▁▁
|
|
lat
|
0
|
59946
|
59946
|
37.77
|
0.33
|
12.24
|
37.77
|
37.77
|
37.8
|
55.95
|
▁▁▁▁▇▁▁▁
|
|
lon
|
0
|
59946
|
59946
|
-122.28
|
2.2
|
-157.86
|
-122.42
|
-122.42
|
-122.27
|
109.2
|
▁▇▁▁▁▁▁▁
|
|
month_last_online
|
0
|
59946
|
59946
|
5.89
|
1.65
|
1
|
6
|
6
|
6
|
12
|
▁▁▁▇▁▁▁▁
|
|
year_last_online
|
0
|
59946
|
59946
|
2011.92
|
0.27
|
2011
|
2012
|
2012
|
2012
|
2012
|
▁▁▁▁▁▁▁▇
|
full_data_descriptives[[6]] %>% kable(caption = "Date-time Variables") %>% kable_styling(bootstrap_options = c("striped", "condensed", "responsive"))
Date-time Variables
|
variable
|
missing
|
complete
|
n
|
min
|
max
|
median
|
n_unique
|
|
last_online
|
0
|
59946
|
59946
|
2011-06-27
|
2012-07-01
|
2012-06-27
|
30123
|
Visualizations
Continuous Variables
full_cleaned_data %>% select_if(is.numeric) %>% select(-time_since_online) %>% ggpairs(progress = FALSE) + theme(axis.text.x = element_text(angle = 20, hjust = 1))

Categorical Variables
is_categorical = function(x) {
is.factor(x) | is.logical(x)
}
full_cleaned_data %>% select_if(is_categorical) %>% select(-ends_with("raw")) %>% mutate_all(factor) %>% pivot_longer(dplyr::everything()) %>% table() %>% as_tibble() %>% ggplot(aes(area = n, fill = value, label = value)) + geom_treemap() + geom_treemap_text(color = "white", place = "centre", grow = TRUE) + facet_wrap(~ name) + theme(legend.position = "none")

full_cleaned_data %>% select_if(is_categorical) %>% select(-ends_with("raw")) %>% mutate_all(factor) %>% mutate_all(collapse_to_other, n_categories = 4) %>% pivot_longer(dplyr::everything()) %>% na.omit() %>% ggplot(aes(x = value)) + geom_bar() + scale_x_discrete(labels = abbreviate) + facet_wrap(~ name, scales = "free_x") + theme(axis.text.x = element_text(angle = 20, hjust = 1))

full_cleaned_data %>% select_if(is_categorical) %>% select(-ends_with("raw")) %>% mutate_all(factor) %>% na.omit() %>% pivot_longer(-sex) %>% ggplot(aes(fill = sex, x = value)) + geom_bar(position = "fill") + scale_y_continuous(labels = scales::percent) + scale_x_discrete(labels = abbreviate) + facet_wrap(~ name, scales = "free_x") + theme(axis.text.x = element_text(angle = 30, hjust = 1))

full_cleaned_data %>% select_if(is_categorical) %>% select(-ends_with("raw")) %>% mutate_all(factor) %>% na.omit() %>% pivot_longer(-orientation) %>% ggplot(aes(fill = orientation, x = value)) + geom_bar(position = "fill") + scale_y_continuous(labels = scales::percent) + scale_x_discrete(labels = abbreviate) + facet_wrap(~ name, scales = "free_x") + theme(axis.text.x = element_text(angle = 30, hjust = 1))

full_cleaned_data %>% select_if(is_categorical) %>% select(-ends_with("raw")) %>% mutate_all(factor) %>% mutate(ethnicity = collapse_to_other(ethnicity, 5)) %>% na.omit() %>% pivot_longer(-ethnicity) %>% ggplot(aes(fill = ethnicity, x = value)) + geom_bar(position = "fill") + scale_y_continuous(labels = scales::percent) + scale_x_discrete(labels = abbreviate) + facet_wrap(~ name, scales = "free_x") + theme(axis.text.x = element_text(angle = 30, hjust = 1))
